library(mice)
library(tidyverse)
# Load the two cleaned data sets: `train` is imputed and exported first,
# `test` second (presumably wave 1 vs. waves 2-3, judging by the file names).
train <- read.csv(file = "../clean_data/mci_wv1go.csv")
test <- read.csv(file = "../clean_data/mci_wv23.csv")

Visualization

The overall missingness patterns differ considerably between waves, so I will impute the training and test sets separately to avoid information leakage.

# Proportion of missing values per column in each data set, reshaped to long
# format (one row per variable/set pair) for plotting.
# NOTE: `names(train)` labels both columns, so this assumes `train` and `test`
# share the same columns in the same order.
df_bar <- data.frame(
  variable = names(train),
  train = colMeans(is.na(train)),
  test = colMeans(is.na(test))
) %>%
  pivot_longer(
    cols = c("train", "test"),
    names_to = "set",
    values_to = "Missingness"
  )

# Side-by-side bars per feature, ordered from most to least missing.
ggplot(
  df_bar,
  aes(x = reorder(variable, desc(Missingness)), y = Missingness, fill = set)
) +
  geom_col(position = "dodge") +
  xlab("Feature") +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5))

# Missing-data pattern tables (disabled):
# md.pattern(train, rotate.names = TRUE)
# md.pattern(test, rotate.names = TRUE)

# Flux plot for each data set, training set first.
fluxplot(data = train)

fluxplot(data = test)

Imputation for training

# Multiple imputation of the training set with mice.
# quickpred() supplies the predictor matrix handed to mice().
pred <- quickpred(train)
imp <- mice(
  train,
  m = 5,
  maxit = 5,
  predictorMatrix = pred,
  seed = 1,
  printFlag = FALSE,
  ridge = 0.001
)
## Warning: Number of logged events: 1444
# The logged events are probably caused by linear combinations (collinearity)
# among the predictors, as the warning suggests; a higher ridge is set to
# address this.
# imp$method    # all use pmm

# Diagnostic plots of the imputations.
bwplot(imp, layout = c(3, 1))

stripplot(imp, pch = c(21, 20), cex = c(1, 1.5), layout = c(3, 1))

# Export: complete() selects the imputation through `action` (it has no `m`
# argument); `action = 1L` returns the first completed data set as a data
# frame.
train_imp <- complete(imp, action = 1L)
# NOTE: write.csv() also writes the row names as an unnamed first column.
write.csv(train_imp, file = "../clean_data/mci_wv1go_imp.csv")

Imputation for testing

# Multiple imputation of the test set with mice, fitted on the test data only.
# quickpred() supplies the predictor matrix handed to mice().
pred <- quickpred(test)
imp <- mice(
  test,
  m = 5,
  maxit = 5,
  predictorMatrix = pred,
  seed = 1,
  printFlag = FALSE,
  ridge = 0.001
)
## Warning: Number of logged events: 1211
# The logged events are probably caused by linear combinations (collinearity)
# among the predictors, as the warning suggests; a higher ridge is set to
# address this.
# imp$method    # all use pmm

# Diagnostic plots of the imputations.
bwplot(imp, layout = c(3, 1))

stripplot(imp, pch = c(21, 20), cex = c(1, 1.5), layout = c(3, 1))

# Export: complete() selects the imputation through `action` (it has no `m`
# argument); `action = 1L` returns the first completed data set as a data
# frame.
test_imp <- complete(imp, action = 1L)
# NOTE: write.csv() also writes the row names as an unnamed first column.
write.csv(test_imp, file = "../clean_data/mci_wv23_imp.csv")